An IPython notebook that explores the relationship (correlation) between betweenness centrality and community membership across a number of mailing lists over a given time period.
In [1]:
%matplotlib inline
In [2]:
from bigbang.archive import Archive
import bigbang.parse as parse
import bigbang.graph as graph
import bigbang.mailman as mailman
import bigbang.process as process
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint as pp
import pytz
import numpy as np
import math
from itertools import repeat
In [3]:
# Pipermail archive roots of the mailing lists analysed in this notebook.
urls = [
    "http://mail.scipy.org/pipermail/ipython-dev/",
    "http://mail.scipy.org/pipermail/ipython-user/",
    "http://mail.scipy.org/pipermail/scipy-dev/",
    "http://mail.scipy.org/pipermail/scipy-user/",
    "http://mail.scipy.org/pipermail/numpy-discussion/",
]

# One Archive per list, in the same order as `urls`.
# NOTE(review): archive_dir presumably points at locally stored copies of the
# archives -- confirm against bigbang.archive.Archive.
archives = []
for list_url in urls:
    archives.append(Archive(list_url, archive_dir="../archives"))
The following cell sets the start month and the end month of the analysis window; both months are included.
In [5]:
# Analysis window given as [year, month] pairs; both end months are included.
date_from_whole = [2010, 1]   # first month analysed: January 2010
date_to_whole = [2012, 12]    # last month analysed: December 2012

# Number of calendar months in the window, counting both the first and the
# last month (months are numbered consecutively as year * 12 + month).
start_year, start_month = date_from_whole
end_year, end_month = date_to_whole
total_month = (end_year * 12 + end_month) - (start_year * 12 + start_month) + 1
In [6]:
# Monthly window boundaries for the whole analysis period.
#   date_from[k] -- 00:00 UTC on the first day of month k
#   date_to[k]   -- 00:00 UTC on the first day of month k + 1
# Both lists have exactly `total_month` entries.
#
# pd.date_range with freq="MS" (month start) handles the December -> January
# rollover, and avoids pd.datetime, which recent pandas releases no longer
# provide.
window_start = "%04d-%02d-01" % (date_from_whole[0], date_from_whole[1])
month_starts = pd.date_range(start=window_start,
                             periods=total_month + 1,
                             freq="MS",
                             tz=pytz.utc)
date_from = list(month_starts[:-1])
date_to = list(month_starts[1:])
In [7]:
def filter_by_date(df, d_from, d_to):
    """Return the rows of ``df`` whose 'Date' lies in ``[d_from, d_to)``.

    The interval is half-open: a row stamped exactly ``d_from`` is kept and a
    row stamped exactly ``d_to`` is left out. Consecutive windows that share a
    boundary therefore cover every row exactly once (with two strict
    comparisons, a row falling exactly on a boundary would be dropped from
    both neighbouring windows).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Date' column comparable to ``d_from`` / ``d_to``.
    d_from : datetime-like
        Inclusive lower bound.
    d_to : datetime-like
        Exclusive upper bound.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``, in their original order.
    """
    in_window = (df['Date'] >= d_from) & (df['Date'] < d_to)
    return df[in_window]
In [8]:
# IG[k] is the interaction graph built from the messages of all archives
# that fall inside month k (between date_from[k] and date_to[k]).
IG = []
for month in range(total_month):
    window_start, window_end = date_from[month], date_to[month]

    # Collect this month's messages from every mailing list ...
    monthly_frames = []
    for arx in archives:
        monthly_frames.append(filter_by_date(arx.data, window_start, window_end))

    # ... and turn the combined frame into a single graph.
    IG.append(graph.messages_to_interaction_graph(pd.concat(monthly_frames)))
In [9]:
# bc[k] is a Series of betweenness centrality values for the nodes of IG[k],
# indexed by node.
bc = [
    pd.Series(nx.betweenness_centrality(IG[month]))
    for month in range(total_month)
]
In [10]:
# Sanity check: bc gets one entry per month, so this should equal total_month.
len(bc)
Out[10]:
`new_dict` is a list with one dictionary per month. Each dictionary maps a user's name to that user's community membership (other definitions of membership are possible). Here the community membership of a user is defined as $\sum_i \log(N_i + 1)$, where $N_i$ is the number of emails the user sent to mailing list $i$ during that month.
In [31]:
# Community membership per user, one dictionary per month:
#   new_dict[t][user]  = sum over mailing lists i of log(N_i + 1)
#   new_dict1[t][user] = sum over mailing lists i of N_i
# where N_i is the activity total reported for that user on list i in month t.
new_dict = [{} for _ in range(total_month)]
new_dict1 = [{} for _ in range(total_month)]

for t in range(total_month):
    # Activity totals for month t, one entry per archive (every archive is
    # used, however many are configured).
    # NOTE(review): get_activity().sum() is assumed to yield a Series indexed
    # by sender string -- confirm against bigbang.archive.Archive.
    filtered_activity = []
    for arx in archives:
        fdf = filter_by_date(arx.data, date_from[t], date_to[t])
        filtered_activity.append(Archive(fdf).get_activity().sum())

    for activity in filtered_activity:
        # Pair each sender label with its total; .index / .values are
        # available on every pandas release, unlike get_values().
        for original_key, count in zip(activity.index, activity.values):
            # The user name is the text between the first "(" and the last ")".
            # A label without parentheses raises ValueError here.
            new_key = original_key[original_key.index("(") + 1:original_key.rindex(")")]
            if new_key not in new_dict[t]:
                new_dict[t][new_key] = 0
                new_dict1[t][new_key] = 0
            # Community membership can be redefined by changing the next line.
            new_dict[t][new_key] += math.log(count + 1)
            # Alternative definition: plain sum of emails over all lists.
            new_dict1[t][new_key] += count
In [47]:
# Turn the raw per-user email sums in new_dict1 into log(sum + 1), so that
# new_dict1 holds the "log of sum" membership. The result has to be stored
# back: a bare list comprehension inside a loop is evaluated and then thrown
# away, leaving new_dict1 untouched.
#
# This cell rewrites new_dict1 in place. Re-run the cell that builds
# new_dict1 before running this one again, otherwise the logarithm is applied
# twice.
for i in range(len(new_dict1)):
    new_dict1[i] = {
        name: np.log(total + 1)
        for name, total in new_dict1[i].items()
    }
In [49]:
# Check that the user names in the membership dictionaries match the node
# names of the monthly graphs. Prints one line per mismatch and nothing at
# all if every month matches. (The differences must be printed explicitly:
# expression values inside a for loop are not displayed by the notebook.)
for i in range(total_month):
    graph_names = set(bc[i].index.values)
    for label, membership in (("new_dict", new_dict[i]), ("new_dict1", new_dict1[i])):
        member_names = set(membership.keys())
        only_in_membership = member_names.difference(graph_names)
        only_in_graph = graph_names.difference(member_names)
        if only_in_membership or only_in_graph:
            print("month %d, %s: only in membership: %s; only in graph: %s"
                  % (i, label, sorted(only_in_membership), sorted(only_in_graph)))
In [53]:
# Monthly side-by-side tables of community membership and betweenness
# centrality. Each frame is built from two records, so row 0 holds the
# membership values and row 1 the betweenness centrality values.
# comparison uses new_dict, comparison1 uses new_dict1.
comparison = [
    pd.DataFrame([new_dict[month], bc[month]])
    for month in range(len(new_dict))
]
comparison1 = [
    pd.DataFrame([new_dict1[month], bc[month]])
    for month in range(len(new_dict))
]
In [54]:
def _row_correlation(frame):
    """Return the Pearson correlation between row 0 and row 1 of ``frame``.

    Uses ``frame.values`` rather than ``get_values()``, which pandas removed
    in version 1.0.
    """
    rows = frame.values
    return np.corrcoef(rows[0], rows[1])[0, 1]


# Per-month correlation between community membership (row 0) and betweenness
# centrality (row 1): corr is based on new_dict, corr1 on new_dict1.
corr = [_row_correlation(comparison[i]) for i in range(len(new_dict))]
corr1 = [_row_correlation(comparison1[i]) for i in range(len(new_dict))]
In [56]:
# Display the per-month correlation coefficients computed from new_dict1.
corr1
Out[56]:
In [57]:
# Monthly correlation between betweenness centrality and community membership.
#   solid line, default (blue) colour: corr  -- membership as sum of logs
#   dashed red line:                   corr1 -- membership from new_dict1
#                                               (intended as log of the sum)
# The x axis counts months from 1 (first month of the window) to total_month.
months = range(1, total_month + 1)
ax = plt.gca()
ax.plot(months, corr, marker='o')
ax.plot(months, corr1, marker='o', linestyle='--', color='r')
Out[57]:
In [ ]: